Neural Networks: Implementing Backpropagation¶
Code explanation¶
Description¶
This project is a simple implementation of a neural network built from scratch using NumPy. It is designed to demonstrate the core concepts of forward propagation, backpropagation, and gradient descent.
Structure¶
1. Helper Functions:
- get_activation(name): Returns a tuple containing the specified activation function (func) and its gradient (grad).
- get_loss(name): Returns a tuple containing the loss function (loss) and its gradient (grad).
2. MyNeuralNetwork Class
- init(): Initializes the network. Sets up layer sizes, randomly initializes weights, sets biases to zero, and prepares the activation/loss functions. It also calls initialize_log_file().
- initialize_log_file(): Creates the CSV log file and writes the header row. The header includes epoch, loss, accuracy, and dynamic columns for the Z-value (pre-activation) and activation-value of each neuron in the hidden layer (e.g., z_hidden_0, a_hidden_0, ...).
- log_training_step(): Appends a new row of data to the CSV file for the current training step.
- forward(X): Performs a forward pass through the network, calculating and storing the hidden layer and output layer values. Returns the final predictions and the hidden layer's Z-values.
- backward(X, y, y_hat, ...): Performs backpropagation to calculate the gradients of the loss with respect to all weights and biases using the chain rule.
- train(X, y, ...): The main training loop. It iterates for the specified number of epochs, performs forward and backward passes, updates the weights and biases, and logs progress to the console and the CSV file every 100 epochs.
3. Training Log
- The output CSV file (default: statistics.csv) provides a trace of the network's internal state, sampled every 100 epochs.
How to use it¶
- Prepare your training data: The network needs input data (X) and target labels (y).
- Initialize the Neural Network: Create an instance of the MyNeuralNetwork class. Define your network's architecture and settings.
- Train the network: Call the .train() method to begin training. Pass your data (X_train, y_train) and specify the epochs (how many times to loop over the data) and the learning_rate.
# Import libraries
import numpy as np
import csv
import os
import plotly.io as pio
# Render Plotly figures inline inside the notebook
pio.renderers.default = 'notebook'
# Activation and loss functions
def get_activation(name):
    """Look up an activation function by name.

    Supported names: "sigmoid" and "softmax" (softmax normalises along
    axis 1, i.e. per sample). Returns a ``(func, grad)`` pair where
    ``grad`` is evaluated on the ACTIVATION output, not the
    pre-activation input: for sigmoid this is the exact derivative
    a * (1 - a); for softmax it is only the diagonal of the Jacobian —
    the softmax + cross-entropy pairing is handled separately in
    backpropagation.

    Raises ValueError for any other name.
    """
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def softmax(x):
        # Subtract the row-wise max before exponentiating for
        # numerical stability (does not change the result).
        shifted = np.exp(x - np.max(x, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def grad(a):
        # Derivative expressed in terms of the activation output a
        return a * (1 - a)

    implementations = {"sigmoid": sigmoid, "softmax": softmax}
    if name not in implementations:
        raise ValueError(f"Unknown activation: {name}")
    return implementations[name], grad
def get_loss(name):
    """Look up a loss function by name.

    Supported names: "mean_squared_error" and "cross_entropy".
    Returns a ``(loss, grad)`` pair. Both variants share the same
    simplified gradient ``y_pred - y_true`` (the MSE gradient omits the
    conventional factor of 2).

    Raises ValueError for any other name.
    """
    def mean_squared_error(y_true, y_pred):
        # Mean of squared residuals over every element
        return np.mean((y_true - y_pred) ** 2)

    def cross_entropy(y_true, y_pred):
        # Clip predictions away from 0 and 1 to avoid log(0)
        eps = 1e-7
        safe_pred = np.clip(y_pred, eps, 1 - eps)
        return -np.mean(np.sum(y_true * np.log(safe_pred), axis=1))

    def grad(y_true, y_pred):
        # Shared simplified gradient used by both losses
        return (y_pred - y_true)

    implementations = {
        "mean_squared_error": mean_squared_error,
        "cross_entropy": cross_entropy,
    }
    if name not in implementations:
        raise ValueError(f"Unknown loss function: {name}")
    return implementations[name], grad
# Define the Neural Network class
class MyNeuralNetwork:
    """A fully connected network with a single hidden layer, built on NumPy.

    Training uses full-batch gradient descent with backpropagation.
    Every 100 epochs the `train` loop appends one row to a CSV log file
    containing the loss, accuracy, hidden-layer Z/activation values and
    every weight and bias, so the whole training trajectory can be
    analysed afterwards.
    """

    def __init__(
        self,
        input_size = 8,
        hidden_size = 3,
        output_size = 8,
        hidden_activation="sigmoid",
        output_activation="softmax",
        loss_function="cross_entropy",
        output_file = 'statistics.csv',
        delete_old_file = True
    ):
        """Initialise weights/biases, resolve the activation and loss
        functions by name, and create the CSV log file.

        Parameters
        ----------
        input_size, hidden_size, output_size : layer dimensions.
        hidden_activation, output_activation : names understood by
            `get_activation` ("sigmoid" or "softmax").
        loss_function : name understood by `get_loss`
            ("mean_squared_error" or "cross_entropy").
        output_file : path of the CSV statistics file.
        delete_old_file : when True, an existing log file is removed so
            this run starts with a fresh header row.
        """
        # Generating the weights - Initial random values
        self.weights_input_hidden = np.random.randn(input_size, hidden_size)
        self.weights_hidden_output = np.random.randn(hidden_size, output_size)
        # Generating the biases - Initial zeros
        self.bias_input_hidden = np.zeros((1, hidden_size))
        self.bias_hidden_output = np.zeros((1, output_size))
        # Get activation and loss functions
        self.hidden_act, self.hidden_grad = get_activation(hidden_activation)
        self.output_act, self.output_grad = get_activation(output_activation)
        self.loss_func, self.loss_grad = get_loss(loss_function)
        # Configuration tag, e.g. "sigmoid_softmax_cross_entropy"; also
        # inspected by `backward` to detect the softmax+cross-entropy pair.
        self.configuration = f'{hidden_activation}_{output_activation}_{loss_function}'
        # Defining the output file for the statistics
        self.output_file = output_file
        # Delete old file if exists
        if delete_old_file:
            if os.path.exists(self.output_file):
                os.remove(self.output_file)
        # Initialize log file
        self.initialize_log_file()

    # Logging functions
    def initialize_log_file(self):
        """Write the CSV header row (only if the file does not exist yet).

        Column order must match `log_training_step`: the fixed metric
        columns first, then a z/a pair per hidden neuron, then the
        input->hidden weights, hidden->output weights, hidden biases and
        output biases, each flattened row-major.
        """
        # Create headers for the CSV file
        headers = ['epoch', 'loss', 'accuracy', 'configuration', 'iteration', 'learning_rate']
        # Add columns for each hidden neuron's Z and activation
        for i in range(self.weights_input_hidden.shape[1]): # For each hidden neuron
            headers.extend([f'z_hidden_{i}', f'a_hidden_{i}'])
        # Add a column for each weight from input to hidden
        for i in range(self.weights_input_hidden.shape[0]):
            for j in range(self.weights_input_hidden.shape[1]):
                headers.extend([f'w_input_hidden_{i}_{j}'])
        # Add columns for each weight from hidden to output
        for i in range(self.weights_hidden_output.shape[0]):
            for j in range(self.weights_hidden_output.shape[1]):
                headers.extend([f'w_hidden_output_{i}_{j}'])
        # Add columns for each bias from input to hidden
        for i in range(self.bias_input_hidden.shape[1]):
            headers.extend([f'b_input_hidden_{i}'])
        # Add columns for each bias from hidden to output
        for i in range(self.bias_hidden_output.shape[1]):
            headers.extend([f'b_hidden_output_{i}'])
        # Write headers to file.
        # Only created when absent, so several runs started with
        # delete_old_file=False can append rows under one shared header.
        if not os.path.exists(self.output_file):
            with open(self.output_file, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(headers)

    def log_training_step(self,
                          epoch, loss, accuracy, iteration,
                          hidden_z, hidden_activations, learning_rate,
                          w_input_hidden, b_input_hidden, w_hidden_output, b_hidden_output):
        """Append one row of training state to the CSV file.

        Only the FIRST sample's hidden Z/activation values are recorded
        (`hidden_z[0]`, `hidden_activations[0]`); the weight and bias
        arrays are flattened in exactly the header's column order.
        """
        # Prepare row data
        row = [epoch, loss, accuracy, self.configuration, iteration, learning_rate]
        # Add Z and activation for each hidden neuron
        for z, a in zip(hidden_z[0], hidden_activations[0]):
            row.extend([z, a])
        # Add weights from input to hidden
        for i in range(w_input_hidden.shape[0]):
            for j in range(w_input_hidden.shape[1]):
                row.extend([w_input_hidden[i][j]])
        # Add weights from hidden to output
        for i in range(w_hidden_output.shape[0]):
            for j in range(w_hidden_output.shape[1]):
                row.extend([w_hidden_output[i][j]])
        # Add biases from input to hidden
        for i in range(b_input_hidden.shape[1]):
            row.extend([b_input_hidden[0][i]])
        # Add biases from hidden to output
        for i in range(b_hidden_output.shape[1]):
            row.extend([b_hidden_output[0][i]])
        # Append to CSV
        with open(self.output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(row)

    # Forward propagation
    def forward(self, X):
        """Run a forward pass over the whole batch X.

        Stores the intermediates on the instance (z_hidden, hidden_layer,
        z_output, output_layer) for use by `backward` and the logger.
        Returns (predictions, hidden-layer Z values).
        """
        # Calculating the hidden layer
        # Store Z values
        self.z_hidden = np.dot(X, self.weights_input_hidden) + self.bias_input_hidden
        # Calculate activation
        self.hidden_layer = self.hidden_act(self.z_hidden)
        # Calculating the output layer
        self.z_output = np.dot(self.hidden_layer, self.weights_hidden_output) + self.bias_hidden_output
        self.output_layer = self.output_act(self.z_output)
        return self.output_layer, self.z_hidden

    # Backward propagation
    def backward(self, X, y, y_hat, learning_rate = 0.01):
        """One backpropagation + gradient-descent step.

        Must be called after `forward` (reads self.hidden_layer).
        Gradients are averaged over the batch (division by m) and the
        weights and biases are updated in place.
        """
        m = X.shape[0]
        # -------------------------------- Output layer error --------------------------------
        # The derivative of the loss function with respect to weights
        if self.configuration.endswith("softmax_cross_entropy"):
            # For softmax + cross-entropy the combined derivative of loss
            # w.r.t. the pre-activation simplifies to (y_hat - y).
            error_term_output = y_hat - y
        else:
            dL_dyhat = self.loss_grad(y, y_hat)
            # The activation gradients from `get_activation` are expressed
            # in terms of the activation value, hence evaluated at y_hat
            # (not at the pre-activation Z).
            d_act_output = self.output_grad(y_hat)
            error_term_output = dL_dyhat * d_act_output
        # The derivative of the loss function with respect to weights is the product of the error term and the hidden layer and then we divide by the number of samples
        dW_hidden_output = np.dot(self.hidden_layer.T, error_term_output) / m
        # The derivative of the loss function with respect to bias is the sum of the error term and then we divide by the number of samples
        db_hidden_output = np.sum(error_term_output, axis=0, keepdims=True) / m
        # -------------------------------- Hidden layer error --------------------------------
        # Output error propagated back through the hidden->output weights,
        # scaled by the hidden activation derivative (chain rule).
        error_term_hidden = np.dot(error_term_output, self.weights_hidden_output.T) * self.hidden_grad(self.hidden_layer)
        # The derivative of the loss function with respect to weights
        dW_input_hidden = np.dot(X.T, error_term_hidden) / m
        # The derivative of the loss function with respect to bias
        db_input_hidden = np.sum(error_term_hidden, axis=0, keepdims=True) / m
        # -------------------------------- Update weights and biases --------------------------------
        # Update weights and biases
        self.weights_hidden_output -= learning_rate * dW_hidden_output
        self.bias_hidden_output -= learning_rate * db_hidden_output
        self.weights_input_hidden -= learning_rate * dW_input_hidden
        self.bias_input_hidden -= learning_rate * db_input_hidden

    # Training the neural network
    def train(self, X, y, epochs=10000, learning_rate=0.01, iteration=0):
        """Full-batch training loop.

        Performs `epochs` forward/backward passes over all of X/y.
        Metrics are computed and a CSV row is written every 100 epochs.
        `iteration` tags the CSV rows so several runs can share one file.
        """
        for epoch in range(epochs):
            # Forward pass
            y_hat, z_hidden = self.forward(X)
            # Backward pass
            self.backward(X, y, y_hat, learning_rate)
            if epoch % 100 == 0:
                # Calculate metrics
                loss = self.loss_func(y, y_hat)
                # Accuracy: fraction of samples whose arg-max prediction
                # matches the arg-max of the one-hot target.
                predictions = np.argmax(y_hat, axis=1)
                accuracy = np.mean(predictions == np.argmax(y, axis=1))
                # Log to console
                # print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Configuration: {self.configuration}")
                # Log to CSV
                self.log_training_step(
                    epoch=epoch,
                    loss=loss,
                    accuracy=accuracy,
                    iteration=iteration,
                    hidden_z=z_hidden,
                    hidden_activations=self.hidden_layer,
                    learning_rate=learning_rate,
                    w_input_hidden=self.weights_input_hidden,
                    b_input_hidden=self.bias_input_hidden,
                    w_hidden_output=self.weights_hidden_output,
                    b_hidden_output=self.bias_hidden_output,
                )
Neural Network Analysis¶
# Import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm

# Network dimensions: 8 one-hot inputs, 3 hidden neurons, 8 outputs
input_size = 8
hidden_size = 3
output_size = 8

# Identity (auto-association) task: each one-hot input maps to the same
# one-hot target, so inputs and targets are both 8x8 identity matrices.
X = np.eye(8, dtype=int)
y = np.eye(8, dtype=int)
Initial parameters effect¶
This graph compares how different random initializations of weights affect the convergence of the network. Each color represents a different random seed used for initialization.
Random initialization¶
The effect of random initialization is moderate: while the starting point influences the early training dynamics, the network converges reliably to similar minima in all runs. This suggests that the chosen architecture and learning parameters allow stable learning across different initial conditions.
# Define output file name
file_name1 = 'statistics_sigmoid_mse.csv'
# Create and train the neural network.
# Five runs with different random weight initialisations; all append to
# the same CSV (only the first run deletes a stale file), tagged by the
# `iteration` column.
for i in tqdm(range(5)):
    nn = MyNeuralNetwork(input_size=input_size,
                         hidden_size=hidden_size,
                         output_size=output_size,
                         hidden_activation="sigmoid",
                         output_activation="sigmoid",
                         loss_function="mean_squared_error",
                         output_file=file_name1,
                         delete_old_file=(i==0)
                         )
    nn.train(X, y, epochs=10000, learning_rate=0.2, iteration=i)
0%| | 0/5 [00:00<?, ?it/s]
20%|███████████████ | 1/5 [00:00<00:03, 1.13it/s]
40%|██████████████████████████████ | 2/5 [00:01<00:02, 1.15it/s]
60%|█████████████████████████████████████████████ | 3/5 [00:02<00:01, 1.18it/s]
80%|████████████████████████████████████████████████████████████ | 4/5 [00:03<00:00, 1.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.18it/s]
# Read the CSV file
df1 = pd.read_csv(file_name1)
# Plot training loss over epochs (one line per random-initialisation run)
fig = px.line(
    df1,
    x="epoch",
    y="loss",
    color="iteration",
    title="Training Loss over Epochs",
    labels={
        "epoch": "Epoch",
        "loss": "Training Loss",
        "iteration": "Experiment Run"
    }
)
fig.update_layout(
    height=500,
    width=900,
    margin=dict(t=100)
)
fig.show()
# Plot training accuracy over epochs (one line per random-initialisation run)
fig = px.line(
    df1,
    x="epoch",
    y="accuracy",
    color="iteration",
    title="Training Accuracy over Epochs",
    labels={
        "epoch": "Epoch",
        "accuracy": "Training Accuracy",
        "iteration": "Experiment Run"
    }
)
fig.update_layout(
    height=500,
    width=900,
    margin=dict(t=100)
)
fig.show()
Learning Rate Effect¶
This plot shows how the training loss decreases across epochs for different learning rates. Each line represents a separate training session using a different learning rate value
# Define output file name
file_name2 = 'statistics_learning_rate.csv'
# Create and train the neural network, one run per learning rate.
# NOTE(review): the formula below yields rates 10, 20, 30, 40, 50 —
# unusually large values for gradient descent; confirm this range is
# intentional.
for i in tqdm(range(5)):
    nn = MyNeuralNetwork(input_size=input_size,
                         hidden_size=hidden_size,
                         output_size=output_size,
                         hidden_activation="sigmoid",
                         output_activation="sigmoid",
                         loss_function="mean_squared_error",
                         output_file=file_name2,
                         delete_old_file=(i==0)
                         )
    # Learning rate for run i: 10 + 10*i
    learning_rate = round( 10 + (10 * i), 2)
    nn.train(X, y, epochs=500, learning_rate=learning_rate, iteration=i)
0%| | 0/5 [00:00<?, ?it/s]
60%|█████████████████████████████████████████████ | 3/5 [00:00<00:00, 21.79it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 22.33it/s]
# Read the CSV file
df2 = pd.read_csv(file_name2)
# Plot training loss over epochs, one line per learning rate.
# Fix: the labels dict previously described "iteration", which is not a
# plotted dimension here; label the actual color dimension instead.
fig = px.line(
    df2,
    x="epoch",
    y="loss",
    color="learning_rate",
    title="Training Loss over Epochs By Learning Rate",
    labels={
        "epoch": "Epoch",
        "loss": "Training Loss",
        "learning_rate": "Learning Rate"
    }
)
fig.update_layout(
    height=500,
    width=900,
    margin=dict(t=100)
)
fig.show()
Higher learning rates lead to faster convergence, while lower learning rates converge more slowly, requiring more epochs to reach similar loss values. However, very high learning rates may risk overshooting the minimum or instability in some cases, though in this test they remained stable.
The learning rate strongly influences convergence speed. A moderate learning rate achieved the best balance between stability and convergence time, while smaller values slowed learning and larger ones risked divergence.
Weights & Biases Analysis¶
This plot shows how the weights and biases evolve across epochs during a training session.
Weights¶
The first graph shows how weights evolve and stabilize over epochs. Even when the initial weights are incorrect, they progressively adjust through training until they converge, indicating the network’s ability to self-correct and optimize its internal representations.
# Define output file name
file_name4 = 'statistics_weights_bias.csv'
# Create and train the neural network: a single run with softmax output
# and cross-entropy loss whose weight/bias trajectories are plotted below.
nn = MyNeuralNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    hidden_activation="sigmoid",
    output_activation="softmax",
    loss_function="cross_entropy",
    output_file=file_name4
)
nn.train(X, y, epochs=10000, learning_rate=0.5)
# Read the CSV file
df = pd.read_csv('statistics_weights_bias.csv')
# Create a 2x2 grid of subplots: one panel per hidden neuron j
# (3 neurons, so the fourth panel stays empty)
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=[f'Neuron j={j}' for j in range(3)],
                    vertical_spacing=0.1,
                    horizontal_spacing=0.1)
# One colour per input neuron i (8 inputs)
colors = [
    '#1f77b4', # muted blue
    '#ff7f0e', # safety orange
    '#2ca02c', # cooked asparagus green
    '#d62728', # brick red
    '#9467bd', # muted purple
    '#8c564b', # chestnut brown
    '#e377c2', # raspberry yogurt pink
    '#7f7f7f' # middle gray
]
# For each hidden neuron j from 0 to 2
for j in range(3):
    # Calculate the row and column position
    row = (j // 2) + 1
    col = (j % 2) + 1
    # Add a trace for each input neuron i (weight w_input_hidden_i_j);
    # the legend is shown once, on the first panel only
    for i in range(8):
        col_name = f'w_input_hidden_{i}_{j}'
        fig.add_trace(
            go.Scatter(
                x=df['epoch'],
                y=df[col_name],
                mode='lines',
                name=f'i={i}',
                showlegend=(j == 0),
                line=dict(color=colors[i], width=2),
            ),
            row=row, col=col
        )
    if col == 1:
        fig.update_yaxes(title_text="Weight", row=row, col=col)
    if row == 2:
        fig.update_xaxes(title_text="Epoch", row=row, col=col)
# Update layout
fig.update_layout(
    title_text="Weight Changes Over Epochs (Input Layer [i] to Hidden Layer [j])",
    height=500,
    width=900,
    legend_title="i value",
    margin=dict(t=100)
)
# Show the plot
fig.show()
# Create a 4x2 grid of subplots: one panel per OUTPUT neuron j (0-7).
# Fix: titles previously covered only 3 panels and labelled them "i".
fig = make_subplots(rows=4, cols=2,
                    subplot_titles=[f'Neuron j={j}' for j in range(8)],
                    vertical_spacing=0.1,
                    horizontal_spacing=0.1)
# One colour per hidden neuron i (3 hidden neurons)
colors = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
]
# For each output neuron j from 0 to 7
for j in range(8):
    # Calculate the row and column position
    row = (j // 2) + 1
    col = (j % 2) + 1
    # Add a trace for each hidden neuron i (weight w_hidden_output_i_j).
    # Fix: traces are coloured by i, so the legend entries must be named
    # by i and emitted once per i (on the first panel) — previously they
    # were named "j=..." and gated on i == 0, producing a legend that did
    # not match the trace colours or the "i value" legend title.
    for i in range(3):
        col_name = f'w_hidden_output_{i}_{j}'
        fig.add_trace(
            go.Scatter(
                x=df['epoch'],
                y=df[col_name],
                mode='lines',
                name=f'i={i}',
                showlegend=(j == 0),
                line=dict(color=colors[i], width=2),
            ),
            row=row, col=col
        )
    if col == 1:
        fig.update_yaxes(title_text="Weight", row=row, col=col)
    # Fix: the x-axis title belongs on the bottom row (4), not row 2
    if row == 4:
        fig.update_xaxes(title_text="Epoch", row=row, col=col)
# Update layout
fig.update_layout(
    title_text="Weight Changes Over Epochs (Hidden Layer [i] to Output Layer [j])",
    height=900,
    width=900,
    legend_title="i value",
    margin=dict(t=100)
)
# Show the plot
fig.show()
Biases¶
This graph shows how bias values evolve during training. In both layers, biases adjust progressively to help neurons activate appropriately, compensating for input imbalances. Early fluctuations reflect the network learning optimal offsets, while later stabilization indicates convergence and refined activation thresholds.
# Read the CSV file
df = pd.read_csv("statistics_weights_bias.csv")
# Create a 1x2 grid: one panel per layer's biases
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Input Layer to Hidden Layer", "Hidden Layer to Output Layer"],
    vertical_spacing=0.1,
    horizontal_spacing=0.1
)
# Hidden layer biases (3 hidden neurons)
for j in range(3):
    col_name = f"b_input_hidden_{j}"
    fig.add_trace(
        go.Scatter(
            x=df["epoch"], y=df[col_name],
            mode="lines", name=f"b_input_hidden_{j}",
            line=dict(width=2)
        ),
        row=1, col=1
    )
# Output layer biases (8 output neurons)
for j in range(8):
    col_name = f"b_hidden_output_{j}"
    fig.add_trace(
        go.Scatter(
            x=df["epoch"], y=df[col_name],
            mode="lines", name=f"b_hidden_output_{j}",
            line=dict(width=2)
        ),
        row=1, col=2
    )
# Layout
fig.update_layout(
    title_text="Evolution of Bias Values During Training",
    height=500,
    width=800,
    legend_title="Bias Variables",
    margin=dict(t=80),
)
# Axes titles
fig.update_xaxes(title_text="Epoch", row=1, col=1)
fig.update_xaxes(title_text="Epoch", row=1, col=2)
fig.update_yaxes(title_text="Bias", row=1, col=1)
fig.show()
Comparative of algorithms¶
Comparative between two activation functions and two loss functions:
- Option 1:
- Hidden layer activation function: Sigmoid function
- Output layer activation function: Sigmoid function
- Loss function: Mean Square Error
- Option 2:
- Hidden layer activation function: Sigmoid function
- Output layer activation function: Softmax function
- Loss function: Cross entropy
# Output file for the softmax + cross-entropy configuration
file_name3 = 'statistics_softmax_ce.csv'
# Create and train the neural network: five runs mirroring the earlier
# sigmoid/MSE experiment, for the comparison plots below.
for i in tqdm(range(5)):
    nn = MyNeuralNetwork(input_size=input_size,
                         hidden_size=hidden_size,
                         output_size=output_size,
                         hidden_activation="sigmoid",
                         output_activation="softmax",
                         loss_function="cross_entropy",
                         output_file=file_name3,
                         delete_old_file=(i==0)
                         )
    nn.train(X, y, epochs=10000, learning_rate=0.2, iteration=i)
# Read the CSV file
# (note: this rebinds df2, which previously held the learning-rate runs)
df2 = pd.read_csv(file_name3)
0%| | 0/5 [00:00<?, ?it/s]
20%|███████████████ | 1/5 [00:00<00:03, 1.05it/s]
40%|██████████████████████████████ | 2/5 [00:01<00:02, 1.13it/s]
60%|█████████████████████████████████████████████ | 3/5 [00:02<00:01, 1.18it/s]
80%|████████████████████████████████████████████████████████████ | 4/5 [00:03<00:00, 1.21it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.18it/s]
# Combine both experiment logs and keep a single representative run
# (iteration == 2) from each configuration.
df = pd.concat([df1, df2], ignore_index=True)
# Fix: take an explicit copy so the column assignment below writes to a
# real frame rather than a view of the filtered data (avoids pandas'
# SettingWithCopyWarning and any silent no-op assignment).
df = df[df['iteration'] == 2].copy()
# Friendlier legend labels for the two configurations
new_labels_map = {
    'sigmoid_sigmoid_mean_squared_error': 'Option 1',
    'sigmoid_softmax_cross_entropy': 'Option 2'
}
df['configuration'] = df['configuration'].replace(new_labels_map)
# Plot training loss over epochs, one line per configuration
fig = px.line(
    df,
    x="epoch",
    y="loss",
    color="configuration",
    title="Training Loss over Epochs"
)
fig.update_layout(
    xaxis_title="Epochs",
    yaxis_title="Loss",
    legend_title="Configuration",
    template="plotly_white",
    height=500,
    width=900,
    margin=dict(t=100)
)
fig.show()
When comparing both configurations, the network with sigmoid activations and mean squared error achieved faster convergence and lower final loss, suggesting a better match between activation and loss functions for this specific task. In contrast, the softmax–cross entropy configuration required more epochs to stabilize and remained at a higher loss level, possibly due to the binary nature of the output.
The activation and loss functions should align with the problem type to ensure efficient learning.
# Plot accuracy over epochs, one line per configuration
fig = px.line(
    df,
    x="epoch",
    y="accuracy",
    color="configuration",
    title="Accuracy over Epochs"
)
fig.update_layout(
    xaxis_title="Epochs",
    yaxis_title="Accuracy",
    legend_title="Configuration",
    template="plotly_white",
    height=500,
    width=900,
    margin=dict(t=100)
)
fig.show()